writeLines("************************************************************************************************************************")
writeLines("************************************************************************************************************************")

# 	Filename: imputation_source.R
# 	Description: Source script to perform multiple imputation of the original data set.
writeLines("Description: Source script to perform multiple imputation of the original data set..")
# 	Author: Barry Hashimoto
# 	Date: December 2019
# 	For: Barry Hashimoto, "Autocratic Consent to International Law: the Case of the International Criminal Court's Jurisdiction," International Organization.

timestamp()
setwd("/Users/barryhashimoto/Desktop/HashimotoReplicationFile2019")

set.seed(100)

suppressPackageStartupMessages({library(foreign); library(Amelia); library(plyr); library(doBy); library(caret); library(parallel); library(future)})

writeLines(""); writeLines("")
writeLines("----------------------------------------------------------------------------------------------------------------------")
writeLines("Part I: Preprocess data, initialize MI model, and run MI model to produce 10 imputed data sets. Computationally intensive.")

 load("iccdata.RData")
 data <- as.data.frame(data)
 rm(all)
  
###############################################################################################################################
# Process the unimputed data, then package it for delivery to the imputation model.

 # Constrain percent variables unit range, excluding 0.0 and 1.0 so the imputation model does not crash.
  data$urbanpoppercent <- ifelse(data$urbanpoppercent==1, (data$urbanpoppercent - 0.001), data$urbanpoppercent)
  data$nunn_desert <- ifelse(data$nunn_desert==0, (data$nunn_desert + 0.1)/100, data$nunn_desert/100)
  data$nunn_tropical <- ifelse(data$nunn_tropical==0, data$nunn_tropical + 0.1, data$nunn_tropical)
  data$nunn_tropical <- ifelse(data$nunn_tropical==100, data$nunn_tropical - 0.1, data$nunn_tropical)
  data$nunn_tropical <- data$nunn_tropical/100

# Attach data.
  attach(data)

# Create a data frame of the ID vars to be merged with the Amelia output below.
  id.vars <- subset(data, select = c("ccode", "country", "lcode", "leader", "year.quarter", 
		"einQTR", "eoutQTR", "ratifyrome", "quartersinoffice.since1998Q3", "quarters.since1998Q3"))

# Variables to be constrained to (0,1) range.
  percents <- as.data.frame(cbind(urbanpoppercent, nunn_desert, nunn_tropical, allunemp, maleunemp))

# Nominal variables.
  nominals <- as.data.frame(cbind(legaltrad2, vdem.regime))

# Ordinal variables -- for use in the imputation model below, including an interaction to improve imputation accuracy.
  ordinals <- as.data.frame(cbind(leaderexitsoffice, RomeStatuteRatification, 
  	pre98conflict, x.ucdp.incidence, cowwar, democracy, democracy2, democracy*RomeStatuteRatification))
           
# Continuous variables. These include some interactions, which may improve imputation accuracy.
continuous.vars <- as.data.frame(cbind(demaid.scaled, multilataid.scaled, imfcredit.scaled, dod.scaled, capital.scaled,
 vdem.ruleoflaw, polyarchy, libdem, partipdem, delibdem, egaldem,
 eth, rel, medianIMR, ptssum.mean,
 gdp.scaled, loggrowth, logoilrent, rug, logland,
 logdeaths, logrefugees,logsoldiers, logsoldierspercent, logexpmil, logpop,
 loginternaldeaths8997, logcwdays45to98, logicwdays45to98, 
 age, year, quartersinoffice, 
 exit.trend.country, exit.trend.single, deaths.trend.country, deaths.trend.single, 
 democracy*capital.scaled,
 polyarchy*RomeStatuteRatification,
 vdem.ruleoflaw*RomeStatuteRatification,
 democracy*vdem.ruleoflaw,
 loggrowth*gdp.scaled,
 loggrowth*logpop,
 logland*logpop,
 loggrowth*logland,
 vdem.ruleoflaw*gdp.scaled,
 logsoldierspercent*logpop, 
 logsoldiers*logpop, 
 logsoldierspercent*logland, 
 logsoldiers*logland)) 

# Put these data into an object named "set" for entry into the imputation model.
  set <- as.data.frame(cbind(region, quarter, id.vars, continuous.vars, nominals, ordinals, percents))

# Identify the variable names. Important note: do not put cross-sectional and time-series variables for the imputation model into idvars object. Only put in ID variables that must carry over to the multiply imputed data sets. 
  id.varnames <- names(id.vars) 
  ord.varnames <- names(ordinals)
  nom.varnames <- names(nominals)
  logistic.varnames <- names(percents)
  
###############################################################################################################################
# Set Bayesian priors for certain variables to obtain plausible imputations and achieve model stability and faster imputations.

  prior.vars <- c(which(names(set)=="vdem.ruleoflaw"),    # 1
                  which(names(set)=="logoilrent"),        # 2
                  which(names(set)=="logrefugees"),       # 3
                  which(names(set)=="logexpmil"),         # 4: here this is expenditures as pct. of GDP.
                  which(names(set)=="gdp.scaled"),        # 5
                  which(names(set)=="ptssum.mean"),       # 6
                  which(names(set)=="logsoldiers"),       # 7
                  which(names(set)=="nunn_desert"),       # 8
                  which(names(set)=="nunn_tropical"),     # 9
                  which(names(set)=="allunemp"),          # 10
                  which(names(set)=="maleunemp"),         # 11
                  which(names(set)=="eth"),               # 12
                  which(names(set)=="rel"),               # 13
                  which(names(set)=="loggrowth"),         # 14
                  which(names(set)=="polyarchy"),         # 15
                  which(names(set)=="libdem"),            # 16
                  which(names(set)=="partipdem"),		  # 17
                  which(names(set)=="delibdem"),	          # 18
			 	  which(names(set)=="egaldem"),           # 19
			 	  which(names(set)=="logsoldierspercent"),# 20
			 	  which(names(set)=="logpop"))            #21 

  prior.matrix <- matrix(c(rep(0, times = length(prior.vars)),
    prior.vars,
   # 1    2    3    4    5   6   7     8      9     10    11    12     13     14          15-19        20    21
    0.2, 0.1,  0,  0.1,  1 , 0,  0,  0.01,  0.1,  0.02,  0.02,  0.01,  0.01, -1.5, rep(0.1, times = 5), 0.1, 6,  # prior minimum
    0.8, 5.0, 15,  7.0, 32 , 5, 17,  0.20,  0.5,  0.90,  0.90,  0.99,  0.99,  1.5, rep(0.9, times = 5), 5,   22,  # prior maximum
    rep(.99, times = length(prior.vars))), # confidence level of the priors
    nrow = length(prior.vars), ncol = 5)

# Note: priors are based on inspection of the support of the data for each variable + countries where variables are missing.
                                                                
###############################################################################################################################
# Impute original data and post-process data set. Use parallelization. This won't work on Windows. Only on Mac.

  NumberCores <- max(availableCores()-2, 1, na.rm = T)

  nimps <- 10
  full <- amelia(x = set, m = nimps, ts ="quarter", cs ="region", 
          idvars = id.varnames, ords = ord.varnames, noms = nom.varnames, lgstc = logistic.varnames,
          polytime = 3,intercs = TRUE, priors = prior.matrix, empri = 0.01*nrow(set), autopri = 0.1, 
          p2s = 0, parallel = "multicore", npcus = NumberCores)
  
  # Examine missing data fraction
  writeLines("Imputation model finished. Summary below. MI data saved in MIforPlot.Rdata for overimputation plot.")
  #print(summary(full))
  
  # Clear memory to speed up the overimputation plots below.
  rm(data, set, id.vars, ordinals, nominals, percents, continuous.vars)
  gc()
  detach(data)
  save(full, file = "MIforPlot.Rdata")
 
###############################################################################################################################
# Process and examine the imputed data.

# Calculate the scaled aid vars.

  aidvars <- c("demaid.scaled", "multilataid.scaled", "imfcredit.scaled", "dod.scaled", "capital.scaled")
  columns.tag <- which(names(full$imputations[[1]]) %in% aidvars)

  ihst <- function(x){log(x+sqrt(x^2+1))} # Inverse hyperbolic sine function instead of log. Reverse with sinh().
  
  for (z in 1:nimps){
  	full$imputations[[z]][,columns.tag] <- ihst(sinh(full$imputations[[z]][,columns.tag])/sinh(full$imputations[[z]]$gdp.scaled))
  	}

# Create certain new variables for analysis in each of the imputed data sets.
  
  suppressPackageStartupMessages(library(caret)) # for the duymmyVars function
  
   for (z in 1:nimps){
   	 # Create dummies for legal traditions
   	   legaldummies <- as.data.frame(predict(dummyVars(~ as.factor(legaltrad2), data = full$imputations[[z]]), 
   	   newdata = full$imputations[[z]]))
       names(legaldummies) <- c("civil", "common", "mixed", "islam")
       full$imputations[[z]] <- data.frame(full$imputations[[z]], legaldummies)
       rm(legaldummies)
       }
       

   for (z in 1:nimps){       
     # Code a binary democracy variable at a high threshold of VDEM's additive polyarchy variable as an alternative measure of minimalist democracy.
       full$imputations[[z]]$democracy.cut <- full$imputations[[z]]$polyarchy>=0.5
       }
 
  for (z in 1:nimps){       
     # If a state has 0 soldiers, then expenditures per soldier will be infinite. More realistic is to code expenditures per soldier as 0 in this case.
       full$imputations[[z]]$soldiers<-sinh(full$imputations[[z]]$logsoldiers)
       #full$imputations[[z]]$soldiers<-ifelse(full$imputations[[z]]$soldiers<=0, 0, full$imputations[[z]]$soldiers)
     
     # Set the lower bound of military expenditures as a percent of GDP at 0.
       full$imputations[[z]]$expmil.corrected<-ifelse(sinh(full$imputations[[z]]$logexpmil)<0,0,sinh(full$imputations[[z]]$logexpmil))
       
     # Calculate "logexpmil": military expenditures divided by the number of soldiers. 
     # Note that this object replaces the old logexpmil, recording expenditures as a percent of GDP * 100.
       
       full$imputations[[z]]$logexpmil <- 
       ihst((full$imputations[[z]]$expmil.corrected/100)*(10^6*sinh(full$imputations[[z]]$gdp.scaled))/full$imputations[[z]]$soldiers) 
       # Note we have changed logexpmil from IHST(expenditures/GDP) to IHST(expenditures/soldiers)!
       # Next correct for cases where the number of soldiers is 0.
	   full$imputations[[z]]$logexpmil[full$imputations[[z]]$soldiers==0] <- 0
     }

# Identify the OECD DAC States so they can be dropped.
 dacmemberlist <- c("Australia", "Austria", "Belgium", "Canada", "Denmark", "Finland", "France", "Germany", "Greece", "Ireland", "Italy", "Japan", "South Korea", "Luxembourg", "Netherlands", "New Zealand", "Norway", "Portugal", "Spain", "Sweden", "Switzerland", "United Kingdom", "United States of America")

  # Exclude DAC member states and observations before quarter 154 (July 1998, the date of the Rome Conference)
  nimps <- 10
  for (z in 1:nimps){
  full$imputations[[z]] <- full$imputations[[z]][ifelse(full$imputations[[z]]$country %in% dacmemberlist, 1, 0)==0,]
  full$imputations[[z]] <- full$imputations[[z]][full$imputations[[z]]$quarter>=154, ]
  }
  
# Save imputed data:  

  if(SaveData == TRUE){
  	#filename <- gsub("\\s+","",paste("imputedicc", Sys.Date(), ".RData")) 
  	save(obj = full, file ="imputedicc.RData")
  	writeLines("Ten imputations saved as imputedicc.RData in your working directory. Source script for imputations is complete.")
  	}

# Examine missing data fraction
  summary(full)
  timestamp()
  rm(list = ls())
  

###############################################################################################################################
# End.


